{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import random\n",
    "import tqdm\n",
    "import jsonlines\n",
    "from datetime import datetime"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "contents = []\n",
    "with open('/<YOUR_OWN_PATH>/ToolQA/data/raw_data/agenda/agenda_events.jsonl', 'r') as f:\n",
    "    for item in jsonlines.Reader(f):\n",
    "        contents.append(item)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "df = pd.DataFrame(contents)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "                      event start_time  end_time                location  \\\n",
      "0         Breakfast meeting    9:00 AM  10:00 AM           Bluebird Cafe   \n",
      "1  Sales pitch presentation   10:30 AM  11:30 AM          Hilton Chicago   \n",
      "2             Cooking class   11:00 AM   1:00 PM           Sur La Table    \n",
      "3     Lunch with colleagues    1:00 PM   2:00 PM  The Cheesecake Factory   \n",
      "4        Charity fundraiser    2:30 PM   4:30 PM        The Ritz-Carlton   \n",
      "\n",
      "    person        date           id  \n",
      "0      Mia  2022/12/15  agenda-0000  \n",
      "1  William  2022/07/05  agenda-0001  \n",
      "2    Emily  2022/03/04  agenda-0002  \n",
      "3     Adam  2022/12/02  agenda-0003  \n",
      "4    Layla  2022/08/21  agenda-0004  \n"
     ]
    }
   ],
   "source": [
    "print(df.head())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1:00:00\n"
     ]
    }
   ],
   "source": [
    "def compute_duration(start_time: str, end_time: str, time_format: str = \"%I:%M %p\") -> str:\n",
    "    start_time_obj = datetime.strptime(start_time, time_format)\n",
    "    end_time_obj = datetime.strptime(end_time, time_format)\n",
    "\n",
    "    duration = end_time_obj - start_time_obj\n",
    "\n",
    "    return str(duration)\n",
    "\n",
    "\n",
    "start_time = df.iloc[0][\"start_time\"]\n",
    "end_time = df.iloc[0][\"end_time\"]\n",
    "print(compute_duration(start_time, end_time))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "question_id = 0\n",
    "questions = []\n",
    "num_question_per_template = 20"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 20/20 [00:00<00:00, 764.88it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'qid': 'hard-agenda-0019', 'question': 'How many events happen on 2022/05/06 in the agenda table?', 'answer': 21}\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "# How many events happen on {date}?\n",
    "for index in tqdm.tqdm(range(num_question_per_template)):\n",
    "    date_list = df['date'].unique()\n",
    "    date = random.choice(date_list)\n",
    "    question = \"How many events happen on {} in the agenda table?\".format(date)\n",
    "    answer = len(df[df['date'] == date])\n",
    "    questions.append({\"qid\": \"hard-agenda-{:0>4d}\".format(question_id), \"question\":question, \"answer\":answer})\n",
    "    question_id += 1\n",
    "print(questions[-1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 20/20 [00:00<00:00, 230.82it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'qid': 'hard-agenda-0039', 'question': 'Who is unavailble between 9:00 AM and 2:00 PM on 2022/06/07 in the agenda table?', 'answer': 15}\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "def convert_time(current_time: str, time_format: str = \"%I:%M %p\") -> str:\n",
    "    time_obj = datetime.strptime(current_time, time_format)\n",
    "    return time_obj\n",
    "def convert_time2(current_time: str, time_format: str = \"%I %p\") -> str:\n",
    "    time_obj = datetime.strptime(current_time, time_format)\n",
    "    return time_obj\n",
    "\n",
    "# How many people are unavailble between {start_time} and {end_time} on {date}?\n",
    "for index in tqdm.tqdm(range(num_question_per_template)):\n",
    "    date_list = df['date'].unique()\n",
    "    date = random.choice(date_list)\n",
    "    sub_data = df[df['date'] == date]\n",
    "    random_indices = random.randint(0, len(sub_data)-1)\n",
    "    start_time = sub_data.iloc[random_indices]['start_time']\n",
    "    end_time = sub_data.iloc[random_indices]['end_time']\n",
    "    question = \"How many people are unavailble between {} and {} on {} in the agenda table?\".format(start_time, end_time, date)\n",
    "    # answer = len(list(set(df[(df['date'] == date) & (df['end_time'] > start_time) & (df['start_time'] < end_time)]['person'].unique())))\n",
    "    sub_table = df[(df['date'] == date)]\n",
    "    answer = []\n",
    "    for i in range(len(sub_table)):\n",
    "        row = sub_table.iloc[i]\n",
    "        try:\n",
    "            row_start_time = convert_time(row['start_time'])\n",
    "        except:\n",
    "            row_start_time = convert_time2(row['start_time'])\n",
    "        try:\n",
    "            row_end_time = convert_time(row['end_time'])\n",
    "        except:\n",
    "            row_end_time = convert_time2(row['end_time'])\n",
    "        try:\n",
    "            st_time = convert_time(start_time)\n",
    "        except:\n",
    "            st_time = convert_time2(start_time)\n",
    "        try:\n",
    "            ed_time = convert_time(end_time)\n",
    "        except:\n",
    "            ed_time = convert_time2(end_time)\n",
    "        if row_start_time < ed_time and row_end_time > st_time:\n",
    "            answer.append(row['person'])\n",
    "        \n",
    "    answer = len(list(set(answer)))\n",
    "    questions.append({\"qid\": \"hard-agenda-{:0>4d}\".format(question_id), \"question\":question, \"answer\":answer})\n",
    "    question_id += 1\n",
    "print(questions[-1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 20/20 [00:00<00:00, 339.14it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'qid': 'hard-agenda-0059', 'question': 'When should I schedule a meeting with Jessica from 9:00 AM to 6:00 PM on 2022/11/28 in the agenda table?', 'answer': '9:00 AM-7:00 PM, 9:00 PM-6:00 PM'}\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "def find_available_slots(meetings: list, meeting_duration: int, start_boundary: str, end_boundary: str, time_format: str = \"%I:%M %p\") -> list:\n",
    "    sorted_meetings = sorted(meetings, key=lambda x: datetime.strptime(x[0], time_format))\n",
    "    available_slots = []\n",
    "\n",
    "    start_boundary_obj = datetime.strptime(start_boundary, time_format)\n",
    "    end_boundary_obj = datetime.strptime(end_boundary, time_format)\n",
    "\n",
    "    if (datetime.strptime(sorted_meetings[0][0], time_format) - start_boundary_obj).seconds >= meeting_duration * 60:\n",
    "        available_slots.append((start_boundary, sorted_meetings[0][0]))\n",
    "\n",
    "    for i in range(len(sorted_meetings) - 1):\n",
    "        start_gap = datetime.strptime(sorted_meetings[i][1], time_format)\n",
    "        end_gap = datetime.strptime(sorted_meetings[i + 1][0], time_format)\n",
    "        gap_duration = int((end_gap - start_gap).total_seconds() / 60)\n",
    "\n",
    "        if gap_duration >= meeting_duration:\n",
    "            available_slots.append((sorted_meetings[i][1], sorted_meetings[i + 1][0]))\n",
    "\n",
    "    if (end_boundary_obj - datetime.strptime(sorted_meetings[-1][1], time_format)).seconds >= meeting_duration * 60:\n",
    "        available_slots.append((sorted_meetings[-1][1], end_boundary))\n",
    "\n",
    "    return available_slots\n",
    "# When should I schedule a meeting with {person} on {date}?\n",
    "for index in tqdm.tqdm(range(num_question_per_template)):\n",
    "    date_list = df['date'].unique()\n",
    "    date = random.choice(date_list)\n",
    "    person = random.choice(df[df['date'] == date]['person'].unique())\n",
    "    question = \"When should I schedule a meeting with {} from 9:00 AM to 6:00 PM on {} in the agenda table?\".format(person, date)\n",
    "    sub_data = df[(df['date'] == date) & (df['person'] == person)]\n",
    "    times = []\n",
    "    for i in range(len(sub_data)):\n",
    "        row = sub_data.iloc[i]\n",
    "        times.append((row['start_time'], row['end_time']))\n",
    "    meeting_duration = 60  # Meeting duration in minutes\n",
    "    start_boundary = \"9:00 AM\"\n",
    "    end_boundary = \"6:00 PM\"\n",
    "\n",
    "    available_slots = find_available_slots(times, meeting_duration, start_boundary, end_boundary)\n",
    "    for i in range(len(available_slots)):\n",
    "        available_slots[i] = available_slots[i][0]+'-'+available_slots[i][1]\n",
    "    answer = \", \".join(available_slots)\n",
    "    questions.append({\"qid\": \"hard-agenda-{:0>4d}\".format(question_id), \"question\":question, \"answer\":answer})\n",
    "    question_id += 1\n",
    "print(questions[-1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 20/20 [00:00<00:00, 346.89it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'qid': 'hard-agenda-0079', 'question': 'What events does Emily have on 2022/09/19 in the agenda table?', 'answer': 'Theatre performance, Art Walk'}\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "# What events does {person} have on {date}?\n",
    "for index in tqdm.tqdm(range(num_question_per_template)):\n",
    "    date_list = df['date'].unique()\n",
    "    date = random.choice(date_list)\n",
    "    person = random.choice(df[df['date'] == date]['person'].unique())\n",
    "    question = \"What events does {} have on {} in the agenda table?\".format(person, date)\n",
    "    answer = \", \".join(list(df[(df['date'] == date) & (df['person'] == person)]['event'].unique()))\n",
    "    questions.append({\"qid\": \"hard-agenda-{:0>4d}\".format(question_id), \"question\":question, \"answer\":answer})\n",
    "    question_id += 1\n",
    "print(questions[-1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 20/20 [00:00<00:00, 680.72it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'qid': 'hard-agenda-0099', 'question': 'How many dates in the agenda table have Imogen scheduled?', 'answer': 88}\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    " # How many dates in the agenda table have {person} scheduled?\n",
    "for index in tqdm.tqdm(range(num_question_per_template)):\n",
    "    person_list = df['person'].unique()\n",
    "    person = random.choice(person_list)\n",
    "    question = \"How many dates in the agenda table have {} scheduled?\".format(person)\n",
    "    answer = len(df[df['person'] == person]['date'].unique())\n",
    "    questions.append({\"qid\": \"hard-agenda-{:0>4d}\".format(question_id), \"question\":question, \"answer\":answer})\n",
    "    question_id += 1\n",
    "print(questions[-1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "with jsonlines.open('/<YOUR_OWN_PATH>/ToolQA/data/questions/hard/genda-hard.jsonl', mode='w') as writer:\n",
    "    for row in questions:\n",
    "        writer.write(row)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "llm",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.16"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
